import os
import numpy as np
import pandas as pd


# Directory holding the training CSV files; the functions below join the
# file names defined further down onto this path.
train_dir = "/home/datanfs/macong_data/tencent_data/train_preliminary"
# NOTE(review): not referenced in the visible code, and despite the "_dir"
# name the value points at a .zip archive -- confirm intended use.
test_dir = "/home/datanfs/macong_data/tencent_data/test.zip"

# File names (relative to train_dir) of the CSV tables read by this script.
ad_csvf = "ad.csv"
click_csvf = "click_log.csv"
user_csv = "user.csv"

def csv_desc(data_dir=None, files=None):
    """Print a quick exploratory summary of each CSV file.

    For every file this prints its name, shape, ``describe()`` statistics,
    column labels, the number of distinct values per column, which columns
    contain nulls, and the ``DataFrame.info()`` report.

    Args:
        data_dir: Directory containing the CSV files. When ``None`` (the
            default) the module-level ``train_dir`` is used.
        files: Iterable of CSV file names located in ``data_dir``. When
            ``None`` (the default) the ad, click-log and user files named
            by the module-level constants are used.
    """
    if data_dir is None:
        data_dir = train_dir
    if files is None:
        files = [ad_csvf, click_csvf, user_csv]
    for f in files:
        frame = pd.read_csv(os.path.join(data_dir, f))
        print("### file name: ", f)
        print("### shape: ", frame.shape)
        print("### describe:", frame.describe())
        print("### keys:", frame.keys())
        for key in frame.keys():
            print(key, frame[key].nunique())
        print("### isnull", frame.isnull().any())
        # DataFrame.info() writes its report to stdout itself and returns
        # None, so wrapping it in print() emitted the report *before* the
        # header and then a stray "None". Print the header, then call it.
        print("### info: ")
        frame.info()
        print()
        print("    ##########   ")

def lable_balance():
    """Print the value counts of every column in the training user CSV.

    Reads the user table from ``train_dir`` and, column by column, prints a
    header line, the ``value_counts()`` of that column, and a separator.
    """
    users = pd.read_csv(os.path.join(train_dir, user_csv))
    # items() yields each column label together with its Series.
    for column, values in users.items():
        print("key:{}  count".format(column))
        print(values.value_counts())
        print("---")

# Script entry point: when run directly, print the per-column value counts
# of the user table.
if __name__ == '__main__':
    lable_balance()
